In [1]:
import numpy as np
import pandas as pd

from datetime import datetime as dt

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import GradientBoostingRegressor

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split, GridSearchCV

In [2]:
path_to_dataset = 'D:/path/to/dataset.xlsx'


# Set the "random_state"
rs = 2023

# Load the dataset
dataset = pd.read_excel(path_to_dataset)

# Columns and row numbers for correct extracting research data
row_start = 3

WaterCut  = 3
MFR_obs   = 7
D_obs     = 8
DD_obs    = 9

MFR_err   = 10
DD_err    = 12

# Extracting research data in numpy.array
X = dataset.iloc[row_start:, [WaterCut, MFR_obs, DD_obs]].values.astype(float)
Y = dataset.iloc[row_start:, [MFR_err, DD_err]].values.astype(float)

# Split data into train/test set
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.25, random_state=rs)

In [6]:
# Set the parameter grids for each estimator

n_estimators      = [50,100,150,200,250,300,400,500,600,800,1000,1200] 
min_samples_split = np.arange(2,11,1) 
min_samples_leaf  = np.arange(1,11,1)
max_depth         = np.append(np.arange(3,20), None) 


param_grid_RF = {'n_estimators'     :n_estimators,
                 'criterion'        :['squared_error', 'absolute_error', 'friedman_mse'],
                 'min_samples_split':min_samples_split,
                 'min_samples_leaf' :min_samples_leaf,
                 'max_depth'        :max_depth
                }
param_grid_ET = {'n_estimators'     :n_estimators,
                 'criterion'        :['squared_error', 'absolute_error', 'friedman_mse'],
                 'min_samples_split':min_samples_split,
                 'min_samples_leaf' :min_samples_leaf,
                 'max_depth'        :max_depth
                } 

param_grid_GB = {'learning_rate'    :[0.01,0.05,0.10,0.15,0.20],   
                 'n_estimators'     :n_estimators,
                 'subsample'        :np.arange(0.1,1.1,0.1),           
                 'criterion'        :['squared_error', 'friedman_mse'], 
                 'min_samples_split':min_samples_split,
                 'min_samples_leaf' :min_samples_leaf,
                 'max_depth'        :max_depth
                }


grids = {'RandomForestRegressor'    :param_grid_RF,
         'ExtraTreesRegressor'      :param_grid_ET,
         'GradientBoostingRegressor':param_grid_GB,
        }


# Set the estimators list

estimators = [#RandomForestRegressor,
              #ExtraTreesRegressor,
              GradientBoostingRegressor
             ]

In [8]:
# Search for optimal hyperparameters

path_to_res = 'D:/path/to/results/'

# For each estimator
for estimator in estimators:
    est_name = str(estimator).split('.')[-1][:-2] # set the current estimator name
    param_grid = grids[est_name]                  # set the param grid
    res = pd.DataFrame()                          # to keep the estimator's results
    
    # For each "y"
    for column in [0,1]:
        start = dt.now()

        # Using GridSearchCV to search for best parameters
        gscv = GridSearchCV(estimator(random_state=rs),
                            param_grid,
                            scoring='neg_mean_absolute_error',
                            n_jobs=-3, # leave 2 jobs free
                            cv=5,      # default
                            verbose=4
                           ).fit(X_train, y_train[:,column])
        
        end = dt.now()
        print('Executing time for '+est_name+'_'+['MFR','DD'][column]+':',end-start)

        # Results
        mae = mean_absolute_error(y_test[:,column], gscv.best_estimator_.predict(X_test))
        res.loc[['MFR','DD'][column], 'MAE'] = np.round(mae, 3)
        for param in gscv.best_params_:
            res.loc[['MFR','DD'][column], param] = gscv.best_params_[param]
            
    res.to_csv(path_to_res + est_name + '.csv')
res

Fitting 5 folds for each of 1944000 candidates, totalling 9720000 fits
Executing time for GradientBoostingRegressor_MFR: 2 days, 9:45:35.339726
Fitting 5 folds for each of 1944000 candidates, totalling 9720000 fits
Executing time for GradientBoostingRegressor_DD: 2 days, 9:55:08.471020


Unnamed: 0,MAE,criterion,learning_rate,max_depth,min_samples_leaf,min_samples_split,n_estimators,subsample
MFR,1.067,squared_error,0.05,10.0,1.0,2.0,500.0,0.5
DD,0.694,squared_error,0.01,14.0,1.0,3.0,1200.0,0.2
